Import Packages

Specify Paths

# Input directories (each ends in "/" because callers join with paste0()).
dataset_path <- "./Data/"
function_path <- "./Functions/"

# Output directories, all located under one shared root.
output_root <- "./Output/"
itrt_plot_path <- paste0(output_root, "InteractivePlot/")
sttc_plot_path <- paste0(output_root, "StaticPlot/")
out_data_path <- paste0(output_root, "Data/")

Import Dataset

# Load the per-day, per-country table from the dataset directory.
daily_covid <-
    paste0(dataset_path, "worldometer_coronavirus_daily_data.csv") %>%
    import()

# Load the one-row-per-country summary table from the same directory.
summary_covid <-
    paste0(dataset_path, "worldometer_coronavirus_summary_data.csv") %>%
    import()

# Preview the first rows of the daily table to check its columns.
head(daily_covid)
##         date     country cumulative_total_cases daily_new_cases active_cases
## 1 2020-02-15 Afghanistan                      0              NA            0
## 2 2020-02-16 Afghanistan                      0              NA            0
## 3 2020-02-17 Afghanistan                      0              NA            0
## 4 2020-02-18 Afghanistan                      0              NA            0
## 5 2020-02-19 Afghanistan                      0              NA            0
## 6 2020-02-20 Afghanistan                      0              NA            0
##   cumulative_total_deaths daily_new_deaths
## 1                       0               NA
## 2                       0               NA
## 3                       0               NA
## 4                       0               NA
## 5                       0               NA
## 6                       0               NA
# Preview the first rows of the per-country summary table.
head(summary_covid)
##       country     continent total_confirmed total_deaths total_recovered
## 1 Afghanistan          Asia          158275         7367          145750
## 2     Albania        Europe          213257         3228          202077
## 3     Algeria        Africa          220415         6310          151347
## 4     Andorra        Europe           25289          141           21511
## 5      Angola        Africa           86636         1789           67477
## 6    Anguilla North America            1777            6            1702
##   active_cases serious_or_critical total_cases_per_1m_population
## 1         5158                1124                          3932
## 2         7952                  23                         74227
## 3        62758                  34                          4893
## 4         3637                  31                        326512
## 5        17370                   7                          2518
## 6           69                  NA                        116869
##   total_deaths_per_1m_population total_tests total_tests_per_1m_population
## 1                            183      826810                         20541
## 2                           1124     1495002                        520354
## 3                            140      230861                          5125
## 4                           1820      249838                       3225714
## 5                             52     1296669                         37686
## 6                            395       51382                       3379283
##   population
## 1   40250878
## 2    2873049
## 3   45046063
## 4      77452
## 5   34407243
## 6      15205

Data Cleaning

# daily_covid[is.na(daily_covid)] <- 0

Data Manipulation and visualization

Generic Questions and Answers

Question 1

What is the overview of covid cases during that period of time?

## global percentage of death, active case and recovered ##

# Builds a donut chart of the worldwide totals of deaths, recoveries and
# active cases. Reads `summary_covid`, saves "q1_pie.png" under
# `sttc_plot_path`, then prints the plot.

# Columns of summary_covid that are summed vertically (over all countries).
categories <- c("total_deaths", "total_recovered", "active_cases")
# Display labels derived from the column names, e.g. "Total Deaths".
category <- str_replace_all(categories, pattern = "_", replacement = " ")
category <- str_to_title(category)

# One total per category; NA entries are skipped.
data <-
    summary_covid[, categories] %>%
    colSums(na.rm = TRUE)
data <- data.frame(
    category = category,
    count = data
)

# Thousands-separated counts for the outer labels.
data$prettyCount <- prettyNum(data$count, big.mark = ",", scientific = FALSE)

# Compute percentages
data$fraction <- data$count / sum(data$count)

# Compute the cumulative percentages (top of each rectangle)
data$ymax <- cumsum(data$fraction)

# Compute the bottom of each rectangle
data$ymin <- c(0, head(data$ymax, n = -1))

# Compute label position (midpoint of each rectangle)
data$labelPosition <- (data$ymax + data$ymin) / 2

# Compute display percentages
data$prettyFraction <- percent(data$fraction)

# Make the plot: stacked rectangles in x = [3, 4] are bent into a ring by
# coord_polar(); xlim() starting at 2 leaves the hole in the middle.
q1 <-
    ggplot(
        data,
        aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = category)
    ) +
    geom_rect() +
    # x here controls label position (inner / outer); 4.3 is outside the ring
    geom_text(
        aes(y = labelPosition, label = prettyCount, color = category),
        x = 4.3,
        fontface = "bold", # constant, so set outside aes()
        size = 3.5
    ) +
    # percentages are drawn inside the ring
    geom_text(
        aes(y = labelPosition, label = prettyFraction),
        x = 3.5,
        fontface = "bold",
        color = "white",
        size = 4
    ) +
    scale_fill_brewer(palette = "Set2") +
    scale_color_brewer(palette = "Set2") +
    coord_polar(theta = "y") +
    xlim(c(2, 4)) +
    theme_void() +
    # centre caption: the sum of the three categories above
    annotate(
        geom = "text",
        x = 2,
        y = 0,
        colour = "#eba834",
        label = paste0(
            "Total Cases\n",
            prettyNum(sum(data$count), big.mark = ",", scientific = FALSE)
        )
    )

# Name the plot explicitly instead of relying on ggplot2's last_plot().
ggsave(paste0(sttc_plot_path, "q1_pie.png"), plot = q1)
## Saving 7 x 5 in image
q1

Question 2

What is the general scale of cases across different continents and countries?

## comparison of cases between different continent ##

# Builds a continent > country > category treemap from `summary_covid`, then
# wraps it in an interactive d3tree2 widget saved under `itrt_plot_path`.
# Only countries whose confirmed-case total is strictly above their own
# continent's 70th percentile are kept.
q2 <-
    summary_covid %>% # data
    select(country:active_cases) %>%
    group_by(continent) %>% # percentile is computed per continent
    filter(
        # removing small cases; na.rm so a missing total cannot abort quantile()
        total_confirmed > quantile(total_confirmed, 0.7, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    # turning 3 columns into a sub-sub group (wide to long conversion)
    pivot_longer(
        cols = c(total_recovered, active_cases, total_deaths),
        names_to = "category",
        values_to = "count"
    ) %>%
    # explicit levels pin each column name to its display label
    mutate(
        category = factor(
            category,
            levels = c("total_recovered", "active_cases", "total_deaths"),
            labels = c("Recovered", "Active Cases", "Deaths")
        )
    ) %>%
    treemap(
        index = c("continent", "country", "category"),
        vSize = "count",
        type = "index",
        palette = "Set2",
        title = "Group by continent top 70 percentile confirmed cases",
        align.labels = list(
            c("center", "center"),
            c("left", "top"),
            c("left", "bottom")
        )
    )

itrt_q2 <- d3tree2(
    q2,
    rootname = "Group by continent top 70 percentile confirmed cases"
)

# NOTE(review): the target file name has no ".html" extension - confirm
# whether that is intended before renaming the output artefact.
saveWidget(itrt_q2, file = paste0(itrt_plot_path, "q2"))
itrt_q2

Question 3

What is the spreading trend in that period of time?

# overview of accumulated cases vs date for all the country
# 

Question 4

Which country has the most cases?

# ranking of cases for the top n countries
# hist

Question 5

How have active cases changed during that period of time for each country?

# video hist

More In-Depth Questions

Question 1

We can easily tell from previous plots that most of the cases are from big countries. What is the relation between population and Covid cases?

# we can easily tell from previous plots that most of the cases are from big
# countries. Now, I'm curious about the relation between population & Covid cases

# Bubble chart of confirmed cases against population, both on log10 axes.
# Fill colour is the continent and bubble size is the number of tests. The
# static plot is saved under `sttc_plot_path`; a plotly version is saved
# under `itrt_plot_path`.
q11 <- summary_covid %>%

    # Draw big bubbles first so they end up at the back. Bubble size is
    # mapped to total_tests, so that is the column to order by.
    arrange(desc(total_tests)) %>%

    # prepare text for tooltip (commaNum is a helper defined outside this chunk)
    mutate(
        text = paste0(
            "Country: ", country,
            "\nPopulation: ", commaNum(population),
            "\nTotal Cases:\t", commaNum(total_confirmed),
            "\nTotal Tests\t", commaNum(total_tests)
        )
    ) %>%

    ggplot(
        aes(
            x = population,
            y = total_confirmed,
            fill = continent,
            size = total_tests,
            text = text # not drawn by ggplot2; used by ggplotly() below
        )
    ) +
    # shape 21 is a filled circle with a separate (black) outline
    geom_point(alpha = 0.5, color = "black", shape = 21, na.rm = TRUE) +
    # axis labels are rescaled by 1e-6 and suffixed with "M"
    scale_x_log10(
        labels = unit_format(unit = "M", scale = 1e-6),
        breaks = 1e+3 * 10^(seq(0, 20, 2))
    ) +
    scale_y_log10(
        labels = unit_format(unit = "M", scale = 1e-6),
        breaks = 10^(seq(1, 21, 2))
    ) +
    scale_size(range = c(2, 25), name = "Total Tests (M), Size") +
    labs(fill = "Continent, Color") +
    scale_fill_viridis(discrete = TRUE, option = "D") +
    coord_cartesian(clip = "off") +
    ylab("Covid Cases (M), log10(n)") +
    xlab("Population (M), log10(n)") +
    theme_bw()

# Name the plot explicitly instead of relying on ggplot2's last_plot().
ggsave(paste0(sttc_plot_path, "q11_bubble.png"), plot = q11)
## Saving 12 x 6 in image
# turn interactive ggplot with plotly and save it
itrt_q11 <- ggplotly(q11, tooltip = "text")
# NOTE(review): the target file name has no ".html" extension - confirm
# whether that is intended before renaming the output artefact.
saveWidget(itrt_q11, file = paste0(itrt_plot_path, "q11"))

q11

itrt_q11

Question 2

Which country did well in this pandemic war?

Question 3

Can weather or geographic position affect the spread of Covid?

Question 4

An even more in-depth question: we’ve seen that population definitely affects how Covid spreads, so let’s check and see why social distancing is needed.

clean up

# Unload packages through pacman at the end of the report; the recorded
# output below lists the packages that were unloaded.
pacman::p_unload(all)
## The following packages have been unloaded:
## d3treeR, htmlwidgets, treemap, plotly, hrbrthemes, viridis, viridisLite, scales, stringr, ggplot2, tidyr, dplyr, rio, installr, pacman